/*
 * Copyright (c) 2009-2010, Code Aurora Forum. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/***************************************************************************
  Neon memset32: Fills a buffer with a repeated 32-bit value, using Neon
     registers when the count is large enough (16 words or more).
     Inputs:
        s: The buffer to write to
        c: The 32-bit integer data to write to the buffer
        n: The size_t count, in 32-bit words (not bytes).
     Outputs:
        None.
***************************************************************************/

    .text
    .code 32                    /* ARM (A32) encoding: the computed jump below
                                   relies on fixed 4-byte instructions */
    .fpu neon
    .align 4                    /* power-of-two form on ARM: 16-byte alignment */
    .global memset32_neon
    .type memset32_neon, %function  /* mark as STT_FUNC so linkers can apply
                                       ARM/Thumb interworking to calls */
    .func memset32_neon

    /*
     * memset32_neon: store the 32-bit value in r1 to r2 consecutive words
     * starting at r0.
     *
     * In:       r0 = buffer (written with word stores and 32-bit-element
     *                 Neon stores, so word alignment is what is needed)
     *           r1 = 32-bit value to store
     *           r2 = number of 32-bit words to write (unsigned, not bytes)
     * Out:      no return value; r0 is left at a path-dependent position.
     * Clobbers: r0, r2, r12, q0, q1 (d0-d3), condition flags.
     *           Nothing is saved or restored; the stack is not used.
     *
     * Strategy:
     *   n == 1      single store
     *   n == 0      return
     *   n in 2..4   one-word-at-a-time loop
     *   n in 5..15  computed jump into an unrolled table of word stores
     *   n >= 16     32-byte Neon stores in chunks of 128/64/32/16 words,
     *               then the same table for the 0..15 word tail
     */
memset32_neon:
    /* Fast path for a single word. */
    cmp        r2, #1
    streq      r1, [r0], #4
    bxeq       lr

    /* Unsigned compares: the count is a size_t. */
    cmp        r2, #4
    bhi        memset32_neon_start
    cmp        r2, #0
    bxeq       lr
memset32_neon_small:
    /* r2 is in 2..4 here: one word per iteration. */
    str        r1, [r0], #4
    subs       r2, r2, #1
    bne        memset32_neon_small
    bx         lr
memset32_neon_start:
    cmp        r2, #16
    blo        memset32_dropthru        /* 5..15 words: scalar table only */
    /* q0:q1 = eight copies of the value (32 bytes per Neon store). */
    vdup.32    q0, r1
    vmov       q1, q0
    /* Enter the chunk ladder at the largest chunk size that fits. */
    cmp        r2, #32
    blo        memset32_16
    cmp        r2, #64
    blo        memset32_32
    cmp        r2, #128
    blo        memset32_64
memset32_128:
    /* r12 = number of whole 128-word (512-byte) chunks; >= 1 here. */
    movs       r12, r2, lsr #7
memset32_loop128:
    subs       r12, r12, #1             /* flags survive the stores below */
    /* 16 x 32 bytes = 128 words. vst1.32 (not .64) so that only word
       alignment is required even when alignment checking is enabled. */
    .rept      16
    vst1.32    {q0, q1}, [r0]!
    .endr
    bne        memset32_loop128
    ands       r2, r2, #0x7f            /* words left after the 128-word chunks */
    bxeq       lr
memset32_64:
    movs       r12, r2, lsr #6          /* Z set when fewer than 64 words remain */
    beq        memset32_32
    /* 8 x 32 bytes = 64 words. */
    .rept      8
    vst1.32    {q0, q1}, [r0]!
    .endr
    ands       r2, r2, #0x3f
    bxeq       lr
memset32_32:
    movs       r12, r2, lsr #5          /* Z set when fewer than 32 words remain */
    beq        memset32_16
    /* 4 x 32 bytes = 32 words. */
    .rept      4
    vst1.32    {q0, q1}, [r0]!
    .endr
    ands       r2, r2, #0x1f
    bxeq       lr
memset32_16:
    movs       r12, r2, lsr #4          /* Z set when fewer than 16 words remain */
    beq        memset32_dropthru
    and        r2, r2, #0xf
    /* 2 x 32 bytes = 16 words. */
    .rept      2
    vst1.32    {q0, q1}, [r0]!
    .endr
memset32_dropthru:
    /*
     * r2 is in 0..15 here. Skip (15 - r2) of the 15 stores below so that
     * exactly r2 words are written. In ARM state pc reads as the address of
     * the add plus 8, i.e. the first str; the nop fills the slot in between.
     * Every entry from the nop through the last str must remain exactly one
     * 4-byte instruction, or the jump arithmetic breaks.
     * r2 == 0 lands directly on the bx lr.
     */
    rsb        r2, r2, #15
    add        pc, pc, r2, lsl #2
    nop
    str        r1, [r0, #56]
    str        r1, [r0, #52]
    str        r1, [r0, #48]
    str        r1, [r0, #44]
    str        r1, [r0, #40]
    str        r1, [r0, #36]
    str        r1, [r0, #32]
    str        r1, [r0, #28]
    str        r1, [r0, #24]
    str        r1, [r0, #20]
    str        r1, [r0, #16]
    str        r1, [r0, #12]
    str        r1, [r0, #8]
    str        r1, [r0, #4]
    str        r1, [r0, #0]
    bx         lr

    .size memset32_neon, . - memset32_neon
    .endfunc
    .end
